Setup and data loading

# Load the analysis stack. tidyverse already bundles ggplot2, dplyr, readr
# and tidyr; they are kept because the original script loaded them
# explicitly, but the duplicate randomForest call has been removed.
library(tidyverse)
library(ggplot2)
library(dplyr)
library(readr)
library(ggthemes)
library(caret)
library(randomForest)
library(pROC)
library(car)
library(broom)
library(reshape2)
library(plotly)
library(patchwork)
library(tidyr)
library(corrplot)
library(glmnet)

# Tract-level USDA Food Access Research Atlas data (2019 release).
raw_df <- read_csv("../dataset/FoodAccessResearchAtlasData2019.csv")

str(raw_df)

# The raw export encodes missing values as the literal string "NULL";
# convert those to real NAs in every character column.
df_fixed <- raw_df %>%
  mutate(across(where(is.character), ~ na_if(., "NULL")))

# Fraction of missing values per column.
missing_pct <- colMeans(is.na(df_fixed))

# Keep only columns that are at least 90% populated.
df_clean <- df_fixed[, missing_pct <= 0.10]

str(df_clean)

SMART Q1

Where are food deserts most geographically concentrated across the U.S., and how do these concentrations differ between urban and rural census tracts?

According to the USDA definitions, a “food desert” is typically a low-income tract that also has low access to supermarkets based on established distance criteria. Here, we use the ‘LILATracts_1And10’ column (which applies a 1-mile threshold for urban and a 10-mile threshold for rural areas) as an indicator. We assume that a value of 1 in ‘LILATracts_1And10’ indicates that the tract qualifies as a food desert.

# BUG FIX: `df` was never defined in the original script; the intended
# input is the cleaned tract-level frame built above. Binding it here
# keeps every later reference to `df` working.
df <- df_clean

# Encode food-desert status (LILATracts_1And10 == 1) as a numeric 0/1 flag.
df$FoodDesert <- as.numeric(as.character(df$LILATracts_1And10))

# Count food-desert tracts per state, split by urban (1) vs rural (0).
state_urban_counts <- df %>%
  group_by(State, Urban) %>%
  summarise(FoodDesert = sum(FoodDesert, na.rm = TRUE), .groups = 'drop')

# Reshape to one row per state with rural/urban/total counts, most
# affected states first.
food_desert_pivot <- state_urban_counts %>%
  pivot_wider(names_from = Urban, values_from = FoodDesert, values_fill = 0) %>%
  rename(`Rural Food Desert Count` = `0`, `Urban Food Desert Count` = `1`) %>%
  mutate(`Total Food Desert Count` = `Rural Food Desert Count` + `Urban Food Desert Count`) %>%
  arrange(desc(`Total Food Desert Count`))

head(food_desert_pivot, 10)
# Total food-desert tracts per state, highest first.
state_totals <- df %>%
  group_by(State) %>%
  summarise(TotalFoodDeserts = sum(FoodDesert, na.rm = TRUE)) %>%
  arrange(desc(TotalFoodDeserts))

# Ten most affected states.
state_totals_top10 <- state_totals %>%
  slice_head(n = 10)

# Horizontal bar chart of the top-10 states.
ggplot(state_totals_top10, aes(x = reorder(State, TotalFoodDeserts), y = TotalFoodDeserts)) +
  geom_col(fill = "coral") +
  coord_flip() +
  labs(
    title = "Top 10 States by Number of Food Deserts",
    x = "State",
    y = "Number of Food Deserts"
  ) +
  theme_minimal(base_size = 14)

# Per-state, per-urbanicity food-desert share: desert tracts / all tracts.
state_summary <- df %>%
  group_by(State, Urban) %>%
  summarise(
    TotalFoodDeserts = sum(FoodDesert, na.rm = TRUE),
    TotalTracts = n(),
    FoodDesertPercentage = 100 * TotalFoodDeserts / TotalTracts,
    .groups = 'drop'
  )

urban_summary <- state_summary %>%
  filter(Urban == 1) %>%
  arrange(desc(FoodDesertPercentage))

rural_summary <- state_summary %>%
  filter(Urban == 0) %>%
  arrange(desc(FoodDesertPercentage))

# Helper: horizontal bar chart of food-desert percentage by state.
plot_desert_pct <- function(data, title, y_label, bar_fill) {
  ggplot(data, aes(x = reorder(State, FoodDesertPercentage), y = FoodDesertPercentage)) +
    geom_col(fill = bar_fill) +
    coord_flip() +
    labs(title = title, x = "State", y = y_label) +
    theme_minimal(base_size = 12)
}

urban_plot <- plot_desert_pct(urban_summary, "Urban Food Desert % by State",
                              "Urban Desert %", "steelblue")
rural_plot <- plot_desert_pct(rural_summary, "Rural Food Desert % by State",
                              "Rural Desert %", "seagreen")

# Side-by-side comparison via patchwork.
urban_plot + rural_plot + plot_layout(ncol = 2)

By far, Texas has the most food desert tracts, but it also has a large number of total tracts. Mississippi has the highest percentage of food desert tracts, with an astonishing 32% of its tracts qualifying as food deserts. Mississippi also has the highest percentage of urban food desert tracts, at 42%. Arizona and Alaska share the highest percentage of rural food desert tracts, at 29%.

SMART Q2

What demographics and socioeconomic factors are linked to food deserts?

Data Cleaning and imputation of the combined data source

# Atlas data: force CensusTract to character so leading zeros survive.
foodatlas <- read_csv("../dataset/FoodAccessResearchAtlasData2019.csv",
                      col_types = cols(CensusTract = col_character()))

# Food Environment (FE) supplemental tables, keyed by the variable name
# each one should be bound to. A loop replaces ten copy-pasted read_csv
# calls; behavior is unchanged.
fe_files <- c(
  socioeconomic = "FE_socioeconomic.csv",
  insecurity    = "FE_insecurity.csv",
  health        = "FE_health.csv",
  stores        = "FE_stores.csv",
  restaurants   = "FE_restaurants.csv",
  taxes         = "FE_taxes.csv",
  local         = "FE_local.csv",
  access        = "FE_access.csv",
  state_data    = "FE_supplemental_data_state.csv",
  county_data   = "FE_supplemental_data_county.csv"
)
for (nm in names(fe_files)) {
  assign(nm, read_csv(file.path("../dataset", fe_files[[nm]])))
}


# Convert the export's literal "NULL" missing-value marker to NA in every
# character column of `df`; all other columns pass through untouched.
clean_nulls <- function(df) {
  mutate(df, across(where(is.character), function(col) na_if(col, "NULL")))
}

# Strip literal "NULL" placeholders from every loaded table. A loop
# replaces eleven copy-pasted clean_nulls() lines; behavior is unchanged.
loaded_tbls <- c("foodatlas", "socioeconomic", "insecurity", "health",
                 "stores", "restaurants", "taxes", "local", "access",
                 "state_data", "county_data")
for (nm in loaded_tbls) {
  assign(nm, clean_nulls(get(nm)))
}


# Pad the tract FIPS to 11 digits (restores dropped leading zeros) and
# derive the 5-digit county FIPS used as the join key.
foodatlas <- foodatlas %>%
  mutate(CensusTract = str_pad(CensusTract, 11, pad = "0"),
         CountyFIPS = substr(CensusTract, 1, 5))

# County-level tables: zero-pad each FIPS code into the same 5-digit key.
# A loop replaces nine copy-pasted mutate() lines; behavior is unchanged.
county_tbls <- c("socioeconomic", "insecurity", "health", "stores",
                 "restaurants", "taxes", "local", "access", "county_data")
for (nm in county_tbls) {
  assign(nm, get(nm) %>% mutate(CountyFIPS = str_pad(FIPS, 5, pad = "0")))
}

# Merge everything into foodatlas: fold each county-level table onto the
# tract-level atlas via successive left joins on CountyFIPS (identical to
# chaining left_join() calls one by one).
county_feature_tbls <- list(socioeconomic, insecurity, health, stores,
                            restaurants, taxes, local, access, county_data)
merged_df <- reduce(
  county_feature_tbls,
  function(acc, tbl) left_join(acc, tbl, by = "CountyFIPS"),
  .init = foodatlas
)

# Numeric 0/1 outcome flag derived from the USDA low-income/low-access label.
merged_df$FoodDesert <- as.numeric(as.character(merged_df$LILATracts_1And10))


# Candidate predictors: every numeric column except the outcome.
predictor_vars <- merged_df %>%
  select(-FoodDesert) %>%
  select(where(is.numeric))


# Drop columns with >10% missing
missing_pct <- colMeans(is.na(predictor_vars))
predictor_vars <- predictor_vars[, missing_pct <= 0.10]

# Impute remaining NA with the column median. replace() keeps each
# column's attributes intact, unlike ifelse(), which can strip them.
predictor_vars <- predictor_vars %>%
  mutate(across(everything(), ~ replace(., is.na(.), median(., na.rm = TRUE))))

# TRUE when `x` (ignoring NAs) takes exactly the two values 0 and 1.
is_binary <- function(x) {
  vals <- sort(unique(na.omit(x)))
  length(vals) == 2 && all(vals %in% c(0, 1))
}

# Split columns into 0/1 indicators vs continuous measures.
binary_cols <- names(predictor_vars)[sapply(predictor_vars, is_binary)]
continuous_cols <- setdiff(names(predictor_vars), binary_cols)

# Convert binary variables to factor
predictor_vars <- predictor_vars %>%
  mutate(across(all_of(binary_cols), as.factor))

# Scale continuous variables. scale() returns a 1-column matrix, which
# would leave matrix-columns embedded in the data frame; as.numeric()
# flattens each back to a plain vector so downstream model.matrix(),
# cor() and ggplot calls behave predictably.
predictor_vars <- predictor_vars %>%
  mutate(across(all_of(continuous_cols), ~ as.numeric(scale(.))))

# Prepare modeling data: outcome first, then predictors; drop any rows
# still carrying NAs (only FoodDesert can be NA after imputation).
model_data <- bind_cols(
  FoodDesert = merged_df$FoodDesert,
  predictor_vars
) %>% drop_na()

# Work on a 30% sample to keep the iterative model fits tractable.
set.seed(97)
model_data_sampled <- model_data %>% sample_frac(0.3)

str(model_data_sampled)

Iterative feature selection process starts here

# Candidate predictors for selection: numeric columns minus the outcome,
# the USDA low-income/low-access label columns that directly encode it,
# and the raw census population count.
excluded_cols <- c("FoodDesert", "LILATracts_1And10", "LILATracts_1And20",
                   "LILATracts_halfAnd10", "2010_Census_Population")

predictor_vars <- model_data_sampled %>%
  select(-all_of(excluded_cols)) %>%
  select(where(is.numeric))

# Outcome vector for convenience.
y <- model_data_sampled$FoodDesert


# Univariate screening: fit FoodDesert ~ x separately for each candidate
# and keep its coefficient row. Backticks around the name guard against
# non-syntactic column names (e.g. names starting with a digit), matching
# the backticked formulas used elsewhere in this script.
univariate_results <- lapply(names(predictor_vars), function(var) {
  temp_formula <- as.formula(paste0("FoodDesert ~ `", var, "`"))
  model <- glm(temp_formula, data = model_data_sampled, family = binomial())
  tidy(model) %>%
    filter(term != "(Intercept)") %>%
    mutate(variable = var)
})

# bind_rows() stacks the per-variable tibbles (same result as
# do.call(rbind, ...) but preserves tibble semantics).
univariate_df <- bind_rows(univariate_results)

# Keep predictors whose univariate coefficient is significant at 5%.
selected_vars_univariate <- univariate_df %>%
  filter(p.value < 0.05) %>%
  pull(variable)

cat("Variables selected after univariate screening:", length(selected_vars_univariate), "\n")
print(selected_vars_univariate)
# Design matrix for the lasso, built from the univariately-screened set
# (backticks protect non-syntactic names); intercept column removed.
lasso_formula <- as.formula(
  paste("~", paste0("`", selected_vars_univariate, "`", collapse = " + "))
)
X_lasso <- model.matrix(lasso_formula, data = model_data_sampled)[, -1]

y_lasso <- model_data_sampled$FoodDesert

# Lasso logistic regression
set.seed(72)
lasso_fit <- cv.glmnet(X_lasso, y_lasso, family = "binomial", alpha = 1)

# Lambda minimizing cross-validated deviance.
best_lambda <- lasso_fit$lambda.min

# Predictors with non-zero coefficients at the CV-optimal lambda.
lasso_coefs <- coef(lasso_fit, s = best_lambda)
nonzero <- rownames(lasso_coefs)[lasso_coefs[, 1] != 0]
selected_lasso_vars <- setdiff(nonzero, "(Intercept)")

cat("Variables selected after Lasso:", length(selected_lasso_vars), "\n")
print(selected_lasso_vars)
# Iteratively drop the predictor with the highest variance inflation
# factor until every VIF is below `threshold`.
#
# data:      data frame containing `target` and candidate predictors.
# target:    name of the outcome column.
# threshold: maximum acceptable VIF.
# Returns the character vector of surviving predictor names.
remove_high_vif_iteratively <- function(data, target = "FoodDesert", threshold = 5) {

  current_vars <- setdiff(names(data), target)

  repeat {
    # car::vif() needs at least two predictors; stop cleanly rather than
    # error out if the filter ever whittles the set down that far.
    if (length(current_vars) < 2) break

    temp_formula <- as.formula(
      paste(target, "~", paste0("`", current_vars, "`", collapse = " + "))
    )

    temp_model <- glm(temp_formula, data = data, family = binomial())

    vif_values <- vif(temp_model)

    if (max(vif_values, na.rm = TRUE) < threshold) {
      break
    }

    # Strip any backticks so the name matches the entries of
    # `current_vars`; without this, setdiff() can miss a non-syntactic
    # name and the loop never shrinks (the p-value filter in this script
    # already strips backticks the same way).
    worst_var <- gsub("`", "", names(which.max(vif_values)))

    cat("Removing variable:", worst_var, "VIF =", round(max(vif_values), 2), "\n")

    current_vars <- setdiff(current_vars, worst_var)
  }

  return(current_vars)
}

# Restrict the sample to the lasso-selected predictors plus the outcome.
model_data_reduced <- model_data_sampled %>%
  select(all_of(c(selected_lasso_vars, "FoodDesert"))) %>%
  drop_na()

# Run VIF-based iterative filtering
final_selected_vars <- remove_high_vif_iteratively(
  model_data_reduced,
  target = "FoodDesert",
  threshold = 5
)

cat("Final number of predictors:", length(final_selected_vars), "\n")
print(final_selected_vars)
# Backward elimination: refit the full model and drop the single least
# significant predictor until every remaining p-value is below `threshold`.
#
# data:      data frame containing `target` and candidate predictors.
# target:    name of the outcome column.
# threshold: maximum acceptable coefficient p-value.
# Returns the character vector of surviving predictor names.
remove_high_pvalue_iteratively <- function(data, target = "FoodDesert", threshold = 0.05) {

  current_vars <- setdiff(names(data), target)

  repeat {
    # Nothing left to test — stop rather than build an empty formula.
    if (length(current_vars) == 0) break

    temp_formula <- as.formula(
      paste(target, "~", paste0("`", current_vars, "`", collapse = " + "))
    )

    temp_model <- glm(temp_formula, data = data, family = binomial())

    # Coefficient table without the intercept row.
    model_summary <- tidy(temp_model) %>%
      filter(term != "(Intercept)")

    max_pval <- max(model_summary$p.value, na.rm = TRUE)

    if (max_pval < threshold) {
      break
    }

    # which.max() picks exactly one row. The original float-equality
    # filter (p.value == max_pval) could match several tied terms and
    # drop them all in one pass; backticks are stripped so the name
    # matches `current_vars`.
    worst_var <- gsub("`", "", model_summary$term[which.max(model_summary$p.value)])

    cat("Removing variable:", worst_var, "p-value =", round(max_pval, 4), "\n")

    current_vars <- setdiff(current_vars, worst_var)
  }

  return(current_vars)
}


# Keep only the VIF-filtered predictors, then prune by p-value at 1%.
model_data_reduced <- model_data_sampled %>%
  select(all_of(c(final_selected_vars, "FoodDesert"))) %>%
  drop_na()

final_significant_vars <- remove_high_pvalue_iteratively(
  model_data_reduced,
  target = "FoodDesert",
  threshold = 0.01
)

cat("Final number of predictors after p-value filtering:", length(final_significant_vars), "\n")
print(final_significant_vars)

# Fit the final logistic model on the surviving predictor set.
final_formula <- as.formula(
  paste("FoodDesert ~", paste0("`", final_significant_vars, "`", collapse = " + "))
)
final_model <- glm(final_formula, data = model_data_reduced, family = binomial())

Final logit-model summary

## 
## Call:
## glm(formula = final_formula, family = binomial(), data = model_data_reduced)
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -2.4968     0.0314  -79.56  < 2e-16 ***
## NUMGQTRS                 0.1605     0.0196    8.17  3.2e-16 ***
## PovertyRate              0.4851     0.0289   16.79  < 2e-16 ***
## TractKids                0.2417     0.0379    6.38  1.8e-10 ***
## TractSeniors             0.2166     0.0308    7.03  2.1e-12 ***
## TractWhite              -0.5274     0.0409  -12.91  < 2e-16 ***
## TractAsian              -0.3717     0.0528   -7.04  1.9e-12 ***
## TractAIAN                0.0965     0.0296    3.26  0.00113 ** 
## TractSNAP                0.3322     0.0301   11.03  < 2e-16 ***
## PCT_NHNA10              -0.1189     0.0312   -3.81  0.00014 ***
## POVRATE15               -0.2068     0.0312   -6.63  3.4e-11 ***
## CH_FOODINSEC_14_17      -0.1470     0.0326   -4.51  6.4e-06 ***
## PCT_OBESE_ADULTS12      -0.0951     0.0270   -3.52  0.00043 ***
## GROCPTH11               -0.1915     0.0374   -5.12  3.1e-07 ***
## SUPERCPTH16              0.1005     0.0224    4.48  7.3e-06 ***
## CONVSPTH16               0.0886     0.0291    3.04  0.00234 ** 
## SPECSPTH11              -0.0944     0.0339   -2.78  0.00541 ** 
## FSRPTH11                 0.0764     0.0292    2.62  0.00886 ** 
## PCT_LOCLFARM07          -0.1869     0.0522   -3.58  0.00035 ***
## PCT_LOCLSALE12          -0.0917     0.0338   -2.71  0.00674 ** 
## PC_DIRSALES07            0.0726     0.0222    3.27  0.00106 ** 
## FMRKT13                 -0.1886     0.0441   -4.28  1.9e-05 ***
## PCT_FMRKT_SNAP18         0.0788     0.0274    2.88  0.00400 ** 
## PCT_FMRKT_CREDIT18      -0.0816     0.0264   -3.09  0.00199 ** 
## PCT_LACCESS_LOWI10       0.2188     0.0346    6.32  2.6e-10 ***
## PCH_LACCESS_HHNV_10_15   0.3866     0.1112    3.48  0.00051 ***
## PCT_LACCESS_HHNV10       0.1741     0.0313    5.57  2.6e-08 ***
## PCT_LACCESS_SNAP15       0.1822     0.0367    4.97  6.7e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16498  on 21758  degrees of freedom
## Residual deviance: 13145  on 21731  degrees of freedom
## AIC: 13201
## 
## Number of Fisher Scoring iterations: 6
## [1] "VIF values of the coefficients:"
##          PC_DIRSALES07               NUMGQTRS             TractAsian 
##                   1.16                   1.20                   1.20 
##            SUPERCPTH16 PCH_LACCESS_HHNV_10_15     CH_FOODINSEC_14_17 
##                   1.21                   1.23                   1.27 
##         PCT_LOCLSALE12     PCT_OBESE_ADULTS12                FMRKT13 
##                   1.30                   1.40                   1.52 
##              GROCPTH11             SPECSPTH11         PCT_LOCLFARM07 
##                   1.52                   1.53                   1.55 
##               FSRPTH11       PCT_FMRKT_SNAP18     PCT_FMRKT_CREDIT18 
##                   1.56                   1.69                   1.73 
##             CONVSPTH16           TractSeniors              POVRATE15 
##                   1.81                   1.86                   1.90 
##     PCT_LACCESS_HHNV10            PovertyRate              TractAIAN 
##                   1.94                   2.03                   2.18 
##             PCT_NHNA10              TractSNAP              TractKids 
##                   2.26                   2.34                   2.77 
##     PCT_LACCESS_LOWI10             TractWhite     PCT_LACCESS_SNAP15 
##                   2.85                   3.09                   3.16

Top 5 features by abs correlation value

final_vars_with_target <- c(final_significant_vars, "FoodDesert")

# Final predictors plus the outcome.
final_data <- model_data_reduced %>%
  select(all_of(final_vars_with_target))

# Coerce every column (including any factors) back to plain numerics so a
# correlation matrix can be computed.
final_data_numeric <- final_data %>%
  mutate(across(everything(), ~ as.numeric(as.character(.))))

# Rank-based (Spearman) correlations over complete cases.
cor_matrix_final <- cor(final_data_numeric, use = "complete.obs", method = "spearman")

# Rank predictors by absolute correlation with the outcome.
# NOTE(review): despite the `top10_*` names below, only the top 5 features
# are kept (slice(1:5)) — the names are kept for compatibility with the
# chunks that reference them.
cor_with_target <- data.frame(
  Variable = rownames(cor_matrix_final),
  Correlation = cor_matrix_final[, "FoodDesert"]
) %>%
  filter(Variable != "FoodDesert") %>%
  mutate(abs_correlation = abs(Correlation))

top10_features <- cor_with_target %>%
  arrange(desc(abs_correlation)) %>%
  slice(1:5)

top10_var_names <- top10_features$Variable

# Sub-matrix restricted to the top features plus the outcome.
keep_cols <- c(top10_var_names, "FoodDesert")
cor_matrix_top10 <- cor_matrix_final[keep_cols, keep_cols]

# Heatmap with coefficients printed in each cell.
corrplot(
  cor_matrix_top10,
  method = "color",
  type = "upper",
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 45,
  number.cex = 0.8,
  tl.cex = 0.9,
  mar = c(0, 0, 2, 0)
)

# Tract-level values of the top features, with the outcome as a factor for
# the boxplot grouping axis. all_of() makes the external character-vector
# selection explicit — passing a bare vector to select() is deprecated in
# tidyselect and emits warnings.
boxplot_data <- merged_df %>%
  mutate(FoodDesert = as.factor(FoodDesert)) %>%
  select(all_of(c(top10_var_names, "FoodDesert")))


# One boxplot per top feature, split by food-desert status.
for (var in top10_var_names) {
  p <- ggplot(boxplot_data, aes(x = FoodDesert, y = .data[[var]])) +
    geom_boxplot(fill = "green") +
    labs(
      title = paste("Boxplot of", var, "by Food Desert Status"),
      x = "Food Desert (0 = No, 1 = Yes)",
      y = var
    ) +
    theme_minimal(base_size = 14)

  print(p)
}

It looks like poverty rate, vehicle access, and SNAP participation are the most prominent factors.

SMART Q3

SMART Q4

Can we develop a predictive model that accurately identifies high-risk areas likely to be or become food deserts based on social and economic indicators?

Logit-model performance metrics

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 18676  2321
##          1   337   425
##                                         
##                Accuracy : 0.878         
##                  95% CI : (0.873, 0.882)
##     No Information Rate : 0.874         
##     P-Value [Acc > NIR] : 0.0366        
##                                         
##                   Kappa : 0.198         
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.1548        
##             Specificity : 0.9823        
##          Pos Pred Value : 0.5577        
##          Neg Pred Value : 0.8895        
##              Prevalence : 0.1262        
##          Detection Rate : 0.0195        
##    Detection Prevalence : 0.0350        
##       Balanced Accuracy : 0.5685        
##                                         
##        'Positive' Class : 1             
## 
## 
## --- Model Performance Metrics ---
## Accuracy : 0.878
## Sensitivity (Recall): 0.155
## Specificity: 0.982
## Precision : 0.558
## F1 Score : 0.242
# ROC curve for the logistic model.
# NOTE(review): `actual_class` and `predicted_probs` come from an earlier
# evaluation chunk not shown here — confirm they are in scope when this runs.
roc_obj <- roc(actual_class, predicted_probs)
auc_value <- auc(roc_obj)

# Plot ROC
plot(roc_obj, main = "ROC Curve for Food Desert Prediction", col = "blue", lwd = 2, print.auc = TRUE)

cat("\nAUC:", round(auc_value, 3), "\n")

Random forest performance metrics

## 
## Call:
##  randomForest(formula = FoodDesert ~ ., data = rf_data, ntree = 500,      mtry = floor(sqrt(length(final_selected_vars))), importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 8
## 
##         OOB estimate of  error rate: 12.3%
## Confusion matrix:
##       0   1 class.error
## 0 18608 405      0.0213
## 1  2264 482      0.8245
# Out-of-bag predictions from the fitted forest: class probabilities
# (column 2 — assumes the positive class "1" is the second factor level)
# and hard class labels.
rf_probs <- predict(rf_model, type = "prob")[, 2]
rf_predicted_class <- predict(rf_model, type = "response")
rf_actual_class <- rf_data$FoodDesert

# Confusion matrix with "1" (food desert) as the positive class.
rf_conf_matrix <- confusionMatrix(rf_predicted_class, rf_actual_class, positive = "1")
print(rf_conf_matrix)

# Headline metrics pulled from the caret confusion-matrix object.
rf_accuracy    <- rf_conf_matrix$overall["Accuracy"]
rf_sensitivity <- rf_conf_matrix$byClass["Sensitivity"]
rf_specificity <- rf_conf_matrix$byClass["Specificity"]
rf_precision   <- rf_conf_matrix$byClass["Precision"]
rf_f1          <- rf_conf_matrix$byClass["F1"]

cat("\n--- Random Forest Performance Metrics ---\n")
cat("Accuracy :", round(rf_accuracy, 3), "\n")
cat("Sensitivity (Recall):", round(rf_sensitivity, 3), "\n")
cat("Specificity:", round(rf_specificity, 3), "\n")
cat("Precision :", round(rf_precision, 3), "\n")
cat("F1 Score :", round(rf_f1, 3), "\n")

# ROC Curve and AUC
rf_roc_obj <- roc(rf_actual_class, rf_probs)
rf_auc <- auc(rf_roc_obj)

plot(rf_roc_obj, main = "Random Forest ROC Curve", col = "darkgreen", lwd = 2, print.auc = TRUE)

cat("\nAUC:", round(rf_auc, 3), "\n")
# Gini-based variable importance, expressed as a share of the total.
rf_var_imp <- importance(rf_model)

rf_var_imp_df <- data.frame(
  Variable = rownames(rf_var_imp),
  Importance = rf_var_imp[, "MeanDecreaseGini"]
) %>%
  mutate(ImportancePct = 100 * Importance / sum(Importance)) %>%
  arrange(desc(ImportancePct))

# Ten most important features.
top10_rf_var_imp_df <- rf_var_imp_df %>%
  slice_head(n = 10)

# Horizontal bar chart with percentage labels just past the bar ends;
# ylim leaves 20% headroom so the labels fit.
ggplot(top10_rf_var_imp_df, aes(x = reorder(Variable, ImportancePct), y = ImportancePct)) +
  geom_col(fill = "coral") +
  geom_text(aes(label = paste0(round(ImportancePct, 1), "%")),
            hjust = -0.1, size = 4) +
  coord_flip() +
  labs(
    title = "Top 10 Feature Importances (Random Forest)",
    x = "Variable",
    y = "Importance (%)"
  ) +
  theme_minimal(base_size = 14) +
  ylim(0, max(top10_rf_var_imp_df$ImportancePct) * 1.2)

The random forest performs slightly better, as it captures the non-linear relationships among the variables more accurately. As for the features, PovertyRate and TractSNAP stand out, which aligns with the correlation matrix we saw earlier. Further analysis using Shapley values should be conducted to examine the full effect of these variables. Additionally, it is recommended to build each model on the entire feature list and perform feature selection for it separately, instead of reusing the features filtered by another model; this would reduce model bias.